The following are the necessary R-packages for running the code below:
library(tidyverse)
library(readr)
library(RColorBrewer)
# be sure to run install.packages("BiocManager") and use BiocManager::install("package") in order to install and run the following libraries
library(DESeq2)
library(apeglm)
library(ashr)
library(pheatmap)| GeneID | Length | Col01_1_1 | Col01_1_2 | Col01_1_3 | Col01_2_1 | Col01_2_2 | Col01_2_3 | Col01_3_1 | Col01_3_2 | Col01_3_3 | Col01_4_1 | Col01_4_2 | Col01_4_3 | Col01_5_1 | Col01_5_2 | Col01_5_3 | ST35_1_1 | ST35_1_2 | ST35_1_3 | ST35_2_1 | ST35_2_2 | ST35_2_3 | ST35_3_1 | ST35_3_2 | ST35_3_3 | ST35_4_1 | ST35_4_2 | ST35_4_3 | ST35_5_1 | ST35_5_2 | ST35_5_3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AT1G01010.1 | 1688 | 179 | 242 | 121 | 160 | 247 | 188 | 398 | 601 | 373 | 568 | 820 | 855 | 586 | 800 | 892 | 334 | 152 | 90 | 350 | 396 | 288 | 576 | 577 | 786 | 613 | 712 | 777 | 927 | 936 | 972 |
| AT1G01020.1 | 1623 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 3 | 0 | 0 | 6 | 0 |
| AT1G01020.2 | 1085 | 15 | 8 | 4 | 23 | 3 | 10 | 4 | 2 | 7 | 8 | 13 | 10 | 9 | 5 | 19 | 19 | 5 | 5 | 6 | 6 | 6 | 23 | 12 | 15 | 13 | 13 | 5 | 10 | 12 | 11 |
| AT1G01030.1 | 1905 | 131 | 100 | 111 | 72 | 78 | 72 | 94 | 88 | 96 | 80 | 93 | 96 | 52 | 85 | 113 | 119 | 81 | 93 | 75 | 78 | 77 | 93 | 116 | 119 | 72 | 80 | 80 | 70 | 100 | 123 |
| AT1G01040.1 | 6251 | 148 | 90 | 99 | 91 | 128 | 85 | 149 | 147 | 156 | 189 | 212 | 188 | 154 | 214 | 178 | 178 | 121 | 125 | 109 | 84 | 142 | 176 | 156 | 189 | 186 | 160 | 227 | 207 | 243 | 209 |
| AT1G01040.2 | 5877 | 5 | 5 | 7 | 8 | 8 | 15 | 9 | 17 | 13 | 23 | 20 | 26 | 16 | 20 | 14 | 16 | 11 | 6 | 19 | 10 | 21 | 12 | 10 | 23 | 31 | 27 | 35 | 34 | 24 | 13 |
| AT1G01046.1 | 207 | 60 | 42 | 29 | 55 | 39 | 32 | 54 | 28 | 40 | 32 | 27 | 38 | 28 | 25 | 55 | 79 | 37 | 72 | 67 | 51 | 54 | 53 | 39 | 73 | 54 | 34 | 27 | 18 | 54 | 60 |
| AT1G01050.1 | 976 | 2117 | 2075 | 1767 | 1775 | 1972 | 2081 | 1944 | 2000 | 1836 | 1588 | 2382 | 2225 | 1616 | 2045 | 2335 | 2428 | 1743 | 2118 | 2664 | 2345 | 2029 | 2272 | 2362 | 2716 | 1961 | 2034 | 2190 | 2285 | 2577 | 2346 |
| AT1G01060.1 | 2704 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| AT1G01060.2 | 2814 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
I created a data frame that matches the sample info provided by
Sanju’s PowerPoint slide as shown here:
The following code creates the data frame itself, which will be used as colData in DESeq2.
# extract column names for the samples (exclude non-sample columns)
# "GeneID" and "Length" are the first two columns (non-sample columns)
sample_names <- colnames(feature_counts)[-c(1, 2)]
# map sample names to treatment and genotype columns, respectively
# sample names follow the pattern <genotype>_<treatment index>_<replicate>,
# where index 1..5 corresponds to treatments 0, 20, 40, 60, 80 for BOTH
# genotypes -- combining the genotype prefixes per treatment removes the
# duplicated Col01_*/ST35_* branches of the original mapping
colData <- data.frame(
  sample_name = sample_names,
  treatment = case_when(
    grepl("^(Col01|ST35)_1_", sample_names) ~ "0",
    grepl("^(Col01|ST35)_2_", sample_names) ~ "20",
    grepl("^(Col01|ST35)_3_", sample_names) ~ "40",
    grepl("^(Col01|ST35)_4_", sample_names) ~ "60",
    grepl("^(Col01|ST35)_5_", sample_names) ~ "80",
    TRUE ~ NA_character_ # default if no pattern matches
  ),
  genotype = case_when(
    grepl("Col", sample_names, ignore.case = TRUE) ~ "wild_type",
    grepl("ST35", sample_names, ignore.case = TRUE) ~ "knockout_line",
    TRUE ~ NA_character_ # default if no pattern matches
  ),
  row.names = sample_names # set sample_name column content as row names
)
# combine 'genotype' and 'treatment' into a new 'group' column with shortened genotypes
# lookup table mapping the full genotype labels to their short forms
genotype_short <- c(wild_type = "wt", knockout_line = "ko")
colData$group <- paste(
  unname(genotype_short[colData$genotype]), # unmatched/NA genotypes map to NA, as before
  colData$treatment,
  sep = "_"
)
# convert appropriate columns into factors and remove sample_name column
colData <- colData |>
  mutate(across(c(treatment, genotype, group), factor)) |>
  select(-sample_name) # sample names already live in the row names
# verify data structure
str(colData)| treatment | genotype | group | |
|---|---|---|---|
| Col01_1_1 | 0 | wild_type | wt_0 |
| Col01_1_2 | 0 | wild_type | wt_0 |
| Col01_1_3 | 0 | wild_type | wt_0 |
| Col01_2_1 | 20 | wild_type | wt_20 |
| Col01_2_2 | 20 | wild_type | wt_20 |
| Col01_2_3 | 20 | wild_type | wt_20 |
| Col01_3_1 | 40 | wild_type | wt_40 |
| Col01_3_2 | 40 | wild_type | wt_40 |
| Col01_3_3 | 40 | wild_type | wt_40 |
| Col01_4_1 | 60 | wild_type | wt_60 |
Let’s verify that the row names in colData matches the column names in feature_counts and are in the same order.
## [1] FALSE
The result is FALSE; this is due to the initial two columns found in the feature_counts for GeneID and Length. Let’s remove the Length column and convert the GeneID column into the row names of this data frame (it shouldn’t interfere with the DESeq2 output)!
# drop the Length column and promote GeneID to the row names of the count matrix
data_counts <- feature_counts |>
  select(-Length) |>
  column_to_rownames(var = "GeneID")
# checking for matching names
all(colnames(data_counts) %in% rownames(colData))## [1] TRUE
## [1] TRUE
| Col01_1_1 | Col01_1_2 | Col01_1_3 | Col01_2_1 | Col01_2_2 | Col01_2_3 | Col01_3_1 | Col01_3_2 | Col01_3_3 | Col01_4_1 | Col01_4_2 | Col01_4_3 | Col01_5_1 | Col01_5_2 | Col01_5_3 | ST35_1_1 | ST35_1_2 | ST35_1_3 | ST35_2_1 | ST35_2_2 | ST35_2_3 | ST35_3_1 | ST35_3_2 | ST35_3_3 | ST35_4_1 | ST35_4_2 | ST35_4_3 | ST35_5_1 | ST35_5_2 | ST35_5_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AT1G01010.1 | 179 | 242 | 121 | 160 | 247 | 188 | 398 | 601 | 373 | 568 | 820 | 855 | 586 | 800 | 892 | 334 | 152 | 90 | 350 | 396 | 288 | 576 | 577 | 786 | 613 | 712 | 777 | 927 | 936 | 972 |
| AT1G01020.1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 3 | 0 | 0 | 6 | 0 |
| AT1G01020.2 | 15 | 8 | 4 | 23 | 3 | 10 | 4 | 2 | 7 | 8 | 13 | 10 | 9 | 5 | 19 | 19 | 5 | 5 | 6 | 6 | 6 | 23 | 12 | 15 | 13 | 13 | 5 | 10 | 12 | 11 |
| AT1G01030.1 | 131 | 100 | 111 | 72 | 78 | 72 | 94 | 88 | 96 | 80 | 93 | 96 | 52 | 85 | 113 | 119 | 81 | 93 | 75 | 78 | 77 | 93 | 116 | 119 | 72 | 80 | 80 | 70 | 100 | 123 |
| AT1G01040.1 | 148 | 90 | 99 | 91 | 128 | 85 | 149 | 147 | 156 | 189 | 212 | 188 | 154 | 214 | 178 | 178 | 121 | 125 | 109 | 84 | 142 | 176 | 156 | 189 | 186 | 160 | 227 | 207 | 243 | 209 |
| AT1G01040.2 | 5 | 5 | 7 | 8 | 8 | 15 | 9 | 17 | 13 | 23 | 20 | 26 | 16 | 20 | 14 | 16 | 11 | 6 | 19 | 10 | 21 | 12 | 10 | 23 | 31 | 27 | 35 | 34 | 24 | 13 |
| AT1G01046.1 | 60 | 42 | 29 | 55 | 39 | 32 | 54 | 28 | 40 | 32 | 27 | 38 | 28 | 25 | 55 | 79 | 37 | 72 | 67 | 51 | 54 | 53 | 39 | 73 | 54 | 34 | 27 | 18 | 54 | 60 |
| AT1G01050.1 | 2117 | 2075 | 1767 | 1775 | 1972 | 2081 | 1944 | 2000 | 1836 | 1588 | 2382 | 2225 | 1616 | 2045 | 2335 | 2428 | 1743 | 2118 | 2664 | 2345 | 2029 | 2272 | 2362 | 2716 | 1961 | 2034 | 2190 | 2285 | 2577 | 2346 |
| AT1G01060.1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| AT1G01060.2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
As we can see, it now returns TRUE for both conditions (matching names and same order).
1/21/25
We will now be creating our DESeqDataSet object using the DESeq2 package and our recently updated and newly created data frames titled data_counts and colData, respectively.
# creating DESeq2 object; ~ treatment * genotype is shorthand that expands to
# ~ treatment + genotype + treatment:genotype (both main effects plus the interaction)
dds <- DESeqDataSetFromMatrix(
  countData = data_counts,
  colData = colData,
  design = ~ treatment * genotype
)
dds ## class: DESeqDataSet
## dim: 40745 30
## metadata(1): version
## assays(1): counts
## rownames(40745): AT1G01010.1 AT1G01020.1 ... ATMG01400.1 ATMG01410.1
## rowData names(0):
## colnames(30): Col01_1_1 Col01_1_2 ... ST35_5_2 ST35_5_3
## colData names(3): treatment genotype group
We have a total of 40745 rows.
Let’s now pre-filter the dds to remove those entries that have fewer than 10 read counts (this is an arbitrary number I’ve simply chosen according to Dr. Liang’s advice).
# remove low gene counts (<10): keep only genes with a total of at least
# 10 reads summed across all samples
dds <- dds[rowSums(counts(dds)) >= 10, ]
dds## class: DESeqDataSet
## dim: 27710 30
## metadata(1): version
## assays(1): counts
## rownames(27710): AT1G01010.1 AT1G01020.1 ... ATMG01400.1 ATMG01410.1
## rowData names(0):
## colnames(30): Col01_1_1 Col01_1_2 ... ST35_5_2 ST35_5_3
## colData names(3): treatment genotype group
We have filtered down to 27710 rows.
1/21/25
We now want to re-level and compare different treatment conditions to one another. Here are the following variations of the comparisons we will conduct:
Note: we don’t have to necessarily state which reference level we want to use. DESeq2 will simply choose alphabetically which level to use.
# use wild_type control as reference baseline
# relevel so DESeq2 reports log2 fold changes relative to the untreated (0)
# wild-type baseline instead of the default alphabetical reference
dds$treatment <- relevel(dds$treatment, ref = "0")
dds$genotype <- relevel(dds$genotype, ref = "wild_type")
# use the control as reference for every single dds object I will be making
# you may want to set this to a different variable object so that we can keep reusing our set up data and not overwrite it every time (double-check with Dr. Liang if I can do that because this doesn't seem to make a new variable anyway)IMPORTANT NOTE: Make sure to collapse the technical replicates before analysis; however, do not collapse biological replicates! (I don’t really know what this means though)
1/21/25
We can now run DESeq2 and see if our data structure works!
# run the differential gene expression analysis
# DESeq() performs size-factor estimation, dispersion estimation, and
# Wald testing in a single call, storing everything back in the dds object
dds <- DESeq(dds)
# view what conditions were contrasted to make sure they match what we actually meant to compare
resultsNames(dds)## [1] "Intercept" "treatment_20_vs_0"
## [3] "treatment_40_vs_0" "treatment_60_vs_0"
## [5] "treatment_80_vs_0" "genotype_knockout_line_vs_wild_type"
## [7] "treatment20.genotypeknockout_line" "treatment40.genotypeknockout_line"
## [9] "treatment60.genotypeknockout_line" "treatment80.genotypeknockout_line"
## log2 fold change (MLE): treatment80.genotypeknockout line
## Wald test p-value: treatment80.genotypeknockout line
## DataFrame with 27710 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## AT1G01010.1 481.647863 -0.0360314 0.340649 -0.105773 0.915763 0.978489
## AT1G01020.1 0.699483 -0.6273926 4.139936 -0.151546 0.879545 NA
## AT1G01020.2 9.829772 -0.3999154 0.965505 -0.414204 0.678725 NA
## AT1G01030.1 90.232278 0.1472727 0.277684 0.530360 0.595862 0.871599
## AT1G01040.1 153.217100 -0.3981934 0.282761 -1.408232 0.159063 0.519782
## ... ... ... ... ... ... ...
## ATMG01370.1 0.800246 4.29449 3.98368 1.078021 0.2810245 NA
## ATMG01380.1 0.343591 1.87808 9.36998 0.200435 0.8411400 NA
## ATMG01390.1 3.526977 1.61474 2.06880 0.780519 0.4350853 NA
## ATMG01400.1 5.379710 1.87345 1.80558 1.037589 0.2994614 NA
## ATMG01410.1 11.977327 -2.23519 1.24253 -1.798904 0.0720339 0.354658
1/28/25
The following code chunk is the normalization of the data’s read counts for future use:
## Col01_1_1 Col01_1_2 Col01_1_3 Col01_2_1 Col01_2_2 Col01_2_3 Col01_3_1 Col01_3_2
## 1.0592495 0.9060561 0.7694873 0.7654071 0.8409846 0.7989893 0.9593589 1.0299932
## Col01_3_3 Col01_4_1 Col01_4_2 Col01_4_3 Col01_5_1 Col01_5_2 Col01_5_3 ST35_1_1
## 1.0102252 0.8769932 1.2538535 1.2085315 0.8529351 1.0513209 1.1874095 1.0163931
## ST35_1_2 ST35_1_3 ST35_2_1 ST35_2_2 ST35_2_3 ST35_3_1 ST35_3_2 ST35_3_3
## 0.8397445 0.8019333 1.0863337 0.9429100 0.8752783 1.0910270 1.2287452 1.3388948
## ST35_4_1 ST35_4_2 ST35_4_3 ST35_5_1 ST35_5_2 ST35_5_3
## 0.9652094 1.0114658 1.1666743 1.1550933 1.3651549 1.2601360
normalized_counts <- counts(dds, normalized = TRUE)
# make a box plot using normalized counts;
# add a +1 pseudocount so genes with zero counts do not become -Inf under log10()
boxplot(log10(normalized_counts + 1))
1/21/25
Let’s begin exploring the results of the differential gene expression analysis by summarizing the results.
# summary results of dds (with a re-leveling of wild_type control vs. each treatment, and interaction between treatment and genotype)
# NOTE(review): `res_dds` is not created in the visible code; presumably
# `res_dds <- results(dds)` ran in a hidden chunk -- confirm
summary(res_dds)##
## out of 27710 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 892, 3.2%
## LFC < 0 (down) : 716, 2.6%
## outliers [1] : 10, 0.036%
## low counts [2] : 6982, 25%
## (mean count < 12)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## out of 27710 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 589, 2.1%
## LFC < 0 (down) : 419, 1.5%
## outliers [1] : 10, 0.036%
## low counts [2] : 7520, 27%
## (mean count < 14)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
We can also use DESeq2’s contrast function to compare specific factor levels after re-leveling. This allows us to make all the necessary comparisons we need without having to re-run the DESeq function with different parameters every time.
Below are the comparison results between both the ____ and the ____:
The struggle here is that I need to be able to be more exact with my comparisons; how can I compare wild_type_0 to knockout_line_0 when the meta-data is contained in different columns (something the contrast function does not accommodate)?
#USEFUL FOR REFERENCE ON USING BASE CONTRAST() FUNCTION BUT THIS CODE CHUNK DOES NOT RUN
# contrast different factor levels (i.e. each treatment vs. ST-35, Col-0 vs. ST-35, etc.)
# alpha = 0.05 sets the adjusted-p-value threshold used by summary() and
# independent filtering (it does not change the reported p-values themselves)
res_0_80 <- results(dds, contrast = c("treatment", "0", "80"), alpha = 0.05)
# do shrinkage after each individual contrast
# convert the DESeq2 results to a data frame with the gene IDs as the first
# column (single call replaces the manual add-rownames-then-reorder steps)
res_0_80_df <- tibble::rownames_to_column(as.data.frame(res_0_80), var = "GeneID")
# write output to a csv file
write.csv(res_0_80_df, "../data/DESeq2_results_0_vs_80.csv", row.names = FALSE) # filter and save only those values have are < 0.05 (get creative with padj in the res_0_80_df)The following are different ways we can visualize the results of our differential gene expression analysis.
1/21/25
Make sure to use the normalized data counts and not the DESeq result output!
# create a data frame with PCA results and sample metadata in one construction
pca_dat <- data.frame(
  pca$x,                              # PCA scores for each sample (PC1, PC2, ...)
  sample = rownames(pca$x),           # sample names
  treatment = colData(dds)$treatment, # grouping variable (treatment condition)
  genotype = colData(dds)$genotype    # add genotype to the PCA data
)
# plot the first two principal components
# map PC1/PC2 to the axes, treatment to color, genotype to point shape,
# and label every point with its sample name
ggplot(pca_dat, aes(x = PC1, y = PC2, color = treatment, shape = genotype, label = sample)) +
geom_point(size = 3) +
# nudge the labels above the points so they do not overlap the markers
geom_text(vjust = -1, hjust = 0.5, size = 3) +
labs(
title = "PCA Plot of Arabidopsis Selenium Treatment:\nSulfur Transport Knockout-Line and Wild-Type Samples",
# axis labels report the % variance explained, pulled from the PCA importance summary
x = paste0("PC1: ", round(summary(pca)$importance[2, 1] * 100, 1), "% Variance"),
y = paste0("PC2: ", round(summary(pca)$importance[2, 2] * 100, 1), "% Variance"),
color = "Treatment",
shape = "Genotype"
) +
theme_minimal()1/28/25
# Step 1: Compute results for the contrast
# NOTE(review): `res` is computed here but not passed to lfcShrink() below; the
# shrinkage call re-derives the same knockout-vs-wild-type comparison via its
# coef argument. Consider lfcShrink(..., res = res) or dropping this line if
# `res` is not used later -- confirm
res <- results(dds, contrast = c("genotype", "knockout_line", "wild_type"))
# Step 2: Apply shrinkage to the results
res_shrink <- lfcShrink(dds, coef = "genotype_knockout_line_vs_wild_type", type = "apeglm")
# check results
summary(res_shrink)##
## out of 27710 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1513, 5.5%
## LFC < 0 (down) : 1902, 6.9%
## outliers [1] : 10, 0.036%
## low counts [2] : 6445, 23%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# create MA plot using ggplot2
# NOTE(review): `pca_dat_shrink` is not defined in the visible code; from the
# columns used (baseMean, log2FoldChange, significance) it looks like a data
# frame built from `res_shrink` with a significance flag added -- confirm where
# it is created (the name suggests it may have been mislabeled)
ggplot(pca_dat_shrink, aes(x = baseMean, y = log2FoldChange, color = significance)) +
geom_point(size = 1) +
scale_color_manual(values = c("Significant" = "blue", "Not Significant" = "black")) +
scale_x_log10() + # Apply log scale to x-axis (baseMean)
labs(title = "MA Plot: Differential Expression with Shrinkage",
x = "Mean of Normalized Counts (log scale)",
y = "Shrunken Log2 Fold Change") +
theme_minimal()2/3/25 - 2/4/25
I will be subsetting our properly structured ‘colData’ data frame produced earlier in this script (found under “1. Preparing Count Data”) as well as the ‘data_counts’ data frame to match column and rownames. This will allow us to properly utilize the contrast() function DESeq2 offers, giving us the ability to more dynamically compare our samples by treating the varying columns as independent factors.
As a reminder, we will be making many comparisons, of which are as follows:
We will be using a total of 4 colData and data_counts variants (including the original colData and data_counts created earlier in this script) that will utilize either the ‘treatment’ or ‘group’ column as their contrast arguments depending on the type of specificity our colData allows.
| treatment | genotype | group | |
|---|---|---|---|
| Col01_1_1 | 0 | wild_type | wt_0 |
| Col01_1_2 | 0 | wild_type | wt_0 |
| Col01_1_3 | 0 | wild_type | wt_0 |
| Col01_2_1 | 20 | wild_type | wt_20 |
| Col01_2_2 | 20 | wild_type | wt_20 |
| Col01_2_3 | 20 | wild_type | wt_20 |
| Col01_1_1 | Col01_1_2 | Col01_1_3 | Col01_2_1 | Col01_2_2 | Col01_2_3 | Col01_3_1 | Col01_3_2 | Col01_3_3 | Col01_4_1 | Col01_4_2 | Col01_4_3 | Col01_5_1 | Col01_5_2 | Col01_5_3 | ST35_1_1 | ST35_1_2 | ST35_1_3 | ST35_2_1 | ST35_2_2 | ST35_2_3 | ST35_3_1 | ST35_3_2 | ST35_3_3 | ST35_4_1 | ST35_4_2 | ST35_4_3 | ST35_5_1 | ST35_5_2 | ST35_5_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AT1G01010.1 | 179 | 242 | 121 | 160 | 247 | 188 | 398 | 601 | 373 | 568 | 820 | 855 | 586 | 800 | 892 | 334 | 152 | 90 | 350 | 396 | 288 | 576 | 577 | 786 | 613 | 712 | 777 | 927 | 936 | 972 |
| AT1G01020.1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 3 | 0 | 0 | 6 | 0 |
| AT1G01020.2 | 15 | 8 | 4 | 23 | 3 | 10 | 4 | 2 | 7 | 8 | 13 | 10 | 9 | 5 | 19 | 19 | 5 | 5 | 6 | 6 | 6 | 23 | 12 | 15 | 13 | 13 | 5 | 10 | 12 | 11 |
| AT1G01030.1 | 131 | 100 | 111 | 72 | 78 | 72 | 94 | 88 | 96 | 80 | 93 | 96 | 52 | 85 | 113 | 119 | 81 | 93 | 75 | 78 | 77 | 93 | 116 | 119 | 72 | 80 | 80 | 70 | 100 | 123 |
| AT1G01040.1 | 148 | 90 | 99 | 91 | 128 | 85 | 149 | 147 | 156 | 189 | 212 | 188 | 154 | 214 | 178 | 178 | 121 | 125 | 109 | 84 | 142 | 176 | 156 | 189 | 186 | 160 | 227 | 207 | 243 | 209 |
| AT1G01040.2 | 5 | 5 | 7 | 8 | 8 | 15 | 9 | 17 | 13 | 23 | 20 | 26 | 16 | 20 | 14 | 16 | 11 | 6 | 19 | 10 | 21 | 12 | 10 | 23 | 31 | 27 | 35 | 34 | 24 | 13 |
Display does not obviously show that all rows/columns are included.
| treatment | genotype | group | |
|---|---|---|---|
| Col01_1_1 | 0 | wild_type | wt_0 |
| Col01_1_2 | 0 | wild_type | wt_0 |
| Col01_1_3 | 0 | wild_type | wt_0 |
| ST35_1_1 | 0 | knockout_line | ko_0 |
| ST35_1_2 | 0 | knockout_line | ko_0 |
| ST35_1_3 | 0 | knockout_line | ko_0 |
| Col01_1_1 | Col01_1_2 | Col01_1_3 | ST35_1_1 | ST35_1_2 | ST35_1_3 | |
|---|---|---|---|---|---|---|
| AT1G01010.1 | 179 | 242 | 121 | 334 | 152 | 90 |
| AT1G01020.1 | 0 | 0 | 0 | 0 | 0 | 0 |
| AT1G01020.2 | 15 | 8 | 4 | 19 | 5 | 5 |
| AT1G01030.1 | 131 | 100 | 111 | 119 | 81 | 93 |
| AT1G01040.1 | 148 | 90 | 99 | 178 | 121 | 125 |
| AT1G01040.2 | 5 | 5 | 7 | 16 | 11 | 6 |
| treatment | genotype | group | |
|---|---|---|---|
| Col01_1_1 | 0 | wild_type | wt_0 |
| Col01_1_2 | 0 | wild_type | wt_0 |
| Col01_1_3 | 0 | wild_type | wt_0 |
| Col01_2_1 | 20 | wild_type | wt_20 |
| Col01_2_2 | 20 | wild_type | wt_20 |
| Col01_2_3 | 20 | wild_type | wt_20 |
| Col01_1_1 | Col01_1_2 | Col01_1_3 | Col01_2_1 | Col01_2_2 | Col01_2_3 | Col01_3_1 | Col01_3_2 | Col01_3_3 | Col01_4_1 | Col01_4_2 | Col01_4_3 | Col01_5_1 | Col01_5_2 | Col01_5_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AT1G01010.1 | 179 | 242 | 121 | 160 | 247 | 188 | 398 | 601 | 373 | 568 | 820 | 855 | 586 | 800 | 892 |
| AT1G01020.1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 1 | 4 |
| AT1G01020.2 | 15 | 8 | 4 | 23 | 3 | 10 | 4 | 2 | 7 | 8 | 13 | 10 | 9 | 5 | 19 |
| AT1G01030.1 | 131 | 100 | 111 | 72 | 78 | 72 | 94 | 88 | 96 | 80 | 93 | 96 | 52 | 85 | 113 |
| AT1G01040.1 | 148 | 90 | 99 | 91 | 128 | 85 | 149 | 147 | 156 | 189 | 212 | 188 | 154 | 214 | 178 |
| AT1G01040.2 | 5 | 5 | 7 | 8 | 8 | 15 | 9 | 17 | 13 | 23 | 20 | 26 | 16 | 20 | 14 |
| treatment | genotype | group | |
|---|---|---|---|
| ST35_1_1 | 0 | knockout_line | ko_0 |
| ST35_1_2 | 0 | knockout_line | ko_0 |
| ST35_1_3 | 0 | knockout_line | ko_0 |
| ST35_2_1 | 20 | knockout_line | ko_20 |
| ST35_2_2 | 20 | knockout_line | ko_20 |
| ST35_2_3 | 20 | knockout_line | ko_20 |
| ST35_1_1 | ST35_1_2 | ST35_1_3 | ST35_2_1 | ST35_2_2 | ST35_2_3 | ST35_3_1 | ST35_3_2 | ST35_3_3 | ST35_4_1 | ST35_4_2 | ST35_4_3 | ST35_5_1 | ST35_5_2 | ST35_5_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AT1G01010.1 | 334 | 152 | 90 | 350 | 396 | 288 | 576 | 577 | 786 | 613 | 712 | 777 | 927 | 936 | 972 |
| AT1G01020.1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 3 | 0 | 0 | 6 | 0 |
| AT1G01020.2 | 19 | 5 | 5 | 6 | 6 | 6 | 23 | 12 | 15 | 13 | 13 | 5 | 10 | 12 | 11 |
| AT1G01030.1 | 119 | 81 | 93 | 75 | 78 | 77 | 93 | 116 | 119 | 72 | 80 | 80 | 70 | 100 | 123 |
| AT1G01040.1 | 178 | 121 | 125 | 109 | 84 | 142 | 176 | 156 | 189 | 186 | 160 | 227 | 207 | 243 | 209 |
| AT1G01040.2 | 16 | 11 | 6 | 19 | 10 | 21 | 12 | 10 | 23 | 31 | 27 | 35 | 34 | 24 | 13 |
I’ve written a function that allows for easy manipulation and production of ‘dds’ objects using the DESeqDataSetFromMatrix() function. It seems very similar to simply using the DESeq2 function itself, however, this allows for cleaner production of the dds objects and includes chosen filtering of gene counts. It also includes error messages to gracefully halt improper inputs.
This function only works if the DESeq2 package is installed and running on the same script.
# Build a DESeqDataSet from a count matrix and sample metadata, with optional
# pre-filtering of low-count genes.
#
# read_counts       : matrix/data frame of raw counts (genes x samples)
# column_data       : data frame of sample metadata, rows matching count columns
# design_parameter  : character string giving the right-hand side of the design
#                     formula, e.g. "treatment + genotype + treatment:genotype"
# filter_low_counts : optional numeric threshold; genes whose total count across
#                     all samples falls below it are removed (NULL = no filter)
# Returns the (optionally filtered) DESeqDataSet.
dds_generator <- function(read_counts, column_data, design_parameter, filter_low_counts = NULL) {
  if (!is.character(design_parameter)) {
    stop("'design_parameter' must be a character string representing the design formula.")
  }
  # convert the design parameter into a formula first, then extract its
  # variables with all.vars(); unlike splitting the string on " + "/" * ",
  # this correctly handles interaction terms (a:b), the a*b shorthand, and
  # arbitrary spacing in the input
  design_formula <- as.formula(paste("~", design_parameter))
  design_vars <- all.vars(design_formula)
  # validate design variables exist in column_data
  missing_vars <- setdiff(design_vars, colnames(column_data))
  if (length(missing_vars) > 0) {
    stop("The following design variables are missing in 'column_data': ", paste(missing_vars, collapse = ", "))
  }
  # creating DESeq2 object
  dds <- DESeqDataSetFromMatrix(countData = read_counts,
                                colData = column_data,
                                design = design_formula)
  # remove low gene counts only if a filtering threshold is specified
  if (!is.null(filter_low_counts) && is.numeric(filter_low_counts) && filter_low_counts > 0) {
    keep_counts <- rowSums(counts(dds)) >= filter_low_counts
    dds <- dds[keep_counts, ]
  }
  return(dds)
}
Using all of the particular colData sub-sets, I will be producing varying ‘dds’ objects that will have different design formulas depending on how specific we need the comparisons between samples to be. These objects will then be implemented with the contrast function for more specific comparisons within each subset.
All ‘dds’ objects will be factor re-leveled to reference the baseline, which is the wild-type control sample (treatment = 0 and genotype = wild_type, or group = wild_type_0). This will be the same for every analysis.
# build the all-samples dds on the combined genotype-by-treatment 'group' factor
dds_treatment <- dds_generator(
  read_counts = data_counts,
  column_data = colData,
  design_parameter = "group",
  filter_low_counts = 10
)
# set reference level: group to wt_0
dds_treatment$group <- relevel(dds_treatment$group, ref = "wt_0")
dds_treatment## class: DESeqDataSet
## dim: 27710 30
## metadata(1): version
## assays(1): counts
## rownames(27710): AT1G01010.1 AT1G01020.1 ... ATMG01400.1 ATMG01410.1
## rowData names(0):
## colnames(30): Col01_1_1 Col01_1_2 ... ST35_5_2 ST35_5_3
## colData names(3): treatment genotype group
# build the controls-only dds contrasting the two genotypes at treatment 0
dds_controls <- dds_generator(
  read_counts = data_counts_controls,
  column_data = colData_controls,
  design_parameter = "genotype",
  filter_low_counts = 10
)
# set reference level: genotype to wild_type
dds_controls$genotype <- relevel(dds_controls$genotype, ref = "wild_type")
dds_controls## class: DESeqDataSet
## dim: 23973 6
## metadata(1): version
## assays(1): counts
## rownames(23973): AT1G01010.1 AT1G01020.2 ... ATMG01400.1 ATMG01410.1
## rowData names(0):
## colnames(6): Col01_1_1 Col01_1_2 ... ST35_1_2 ST35_1_3
## colData names(3): treatment genotype group
# build the wild-type-only dds comparing treatments within that genotype
dds_wt <- dds_generator(
  read_counts = data_counts_wt,
  column_data = colData_wt,
  design_parameter = "treatment",
  filter_low_counts = 10
)
# set reference level: treatment to 0
dds_wt$treatment <- relevel(dds_wt$treatment, ref = "0")
dds_wt## class: DESeqDataSet
## dim: 26602 15
## metadata(1): version
## assays(1): counts
## rownames(26602): AT1G01010.1 AT1G01020.1 ... ATMG01400.1 ATMG01410.1
## rowData names(0):
## colnames(15): Col01_1_1 Col01_1_2 ... Col01_5_2 Col01_5_3
## colData names(3): treatment genotype group
# build the knockout-only dds comparing treatments within that genotype
dds_ko <- dds_generator(
  read_counts = data_counts_ko,
  column_data = colData_ko,
  design_parameter = "treatment",
  filter_low_counts = 10
)
# set reference level: treatment to 0
dds_ko$treatment <- relevel(dds_ko$treatment, ref = "0")
dds_ko## class: DESeqDataSet
## dim: 26818 15
## metadata(1): version
## assays(1): counts
## rownames(26818): AT1G01010.1 AT1G01020.1 ... ATMG01400.1 ATMG01410.1
## rowData names(0):
## colnames(15): ST35_1_1 ST35_1_2 ... ST35_5_2 ST35_5_3
## colData names(3): treatment genotype group
To allow for easier reproducibility and less redundancy in the code, I’ve written two functions that allow for producing dynamic contrasts that utilize either apeglm shrinkage with coefficients or ashr shrinkage with contrasts, and writes the output to a dynamically named CSV file.
As mentioned, the coef_shrinkage_contrast() function uses apeglm shrinkage and coefficients from the ‘dds’ levels. This is especially useful to maintain independence amongst comparisons and the factor levels.
This function also dynamically incorporates the proper coefficient using the column, level_1, and priority_level_2 parameters. It also has a built-in check to make sure the coefficient exists in said dds object.
This function requires the apeglm package installed and running on the same script to work.
# Apply apeglm LFC shrinkage to a fitted dds using a named coefficient and
# write the result to a dynamically named CSV file.
#
# dds_object       : a DESeqDataSet that has already been run through DESeq()
# column           : the design column the coefficient belongs to (e.g. "treatment")
# level_1          : numerator level of the comparison
# priority_level_2 : denominator (reference) level of the comparison
# output_dir       : directory the CSV is written into (created if missing)
# Invisibly returns the shrunken results as a data frame (GeneID first column).
coef_shrinkage_contrast <- function(dds_object, column, level_1, priority_level_2, output_dir = "../data/arabidopsis_deseq2_results/") {
  # grab the object's name ONCE as a string (used in both the error message
  # and the output file name; the original computed this twice)
  dds_name <- deparse(substitute(dds_object))
  # construct the expected coefficient name, e.g. "treatment_20_vs_0"
  coef_name <- paste0(column, "_", level_1, "_vs_", priority_level_2)
  # ensure the coefficient exists before running the (slow) shrinkage step
  if (!(coef_name %in% resultsNames(dds_object))) {
    stop(paste("Coefficient", coef_name, "not found in object", dds_name, ". Check resultsNames(", dds_name, ")."))
  }
  # apply shrinkage using apeglm
  shrink_result <- lfcShrink(dds_object, coef = coef_name, type = "apeglm")
  # convert the shrinkage results to a data frame with GeneID as the first column
  result_df <- as.data.frame(shrink_result)
  result_df$GeneID <- rownames(result_df)
  result_df <- result_df[, c("GeneID", colnames(result_df)[-ncol(result_df)])]
  # make sure the output directory exists so write.csv() does not fail
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  # generate dynamic output file name and write the CSV
  output_file <- paste0(output_dir, "deseq2_", dds_name, "_", level_1, "_vs_", priority_level_2, ".csv")
  write.csv(result_df, output_file, row.names = FALSE)
  # return the data frame invisibly so callers can reuse it without re-reading the CSV
  invisible(result_df)
}
The following ashr_shrinkage_contrast() function utilizes the ashr shrinkage estimator along with the contrast function to allow mutability in contrasting factor levels that were not originally set during factor re-leveling. In other words, we can compare different factor levels that do not involve the reference level.
All of the parameters used in the function above are used and represented the same in this one. The same checks are also in place.
This function requires the ashr package to be installed and running on the same script.
# Apply ashr LFC shrinkage to a fitted dds using an arbitrary two-level contrast
# and write the result to a dynamically named CSV file.
#
# Parameters mirror coef_shrinkage_contrast(): level_1 is the numerator and
# priority_level_2 the denominator of the contrast on `column`; output_dir is
# created if missing.
# Invisibly returns the shrunken results as a data frame (GeneID first column).
ashr_shrinkage_contrast <- function(dds_object, column, level_1, priority_level_2, output_dir = "../data/arabidopsis_deseq2_results/") {
  # grab the object's name as a string (used in error messages and the file name)
  dds_name <- deparse(substitute(dds_object))
  # get the levels of the specified factor column
  factor_levels <- levels(colData(dds_object)[[column]])
  # check whether both requested levels exist in the factor column;
  # use the scalar short-circuit `||` (not vectorized `|`) in the if() condition
  if (!(level_1 %in% factor_levels) || !(priority_level_2 %in% factor_levels)) {
    stop(paste("Levels", level_1, "or", priority_level_2, "not found in column", column, "of object", dds_name, ". Check levels(colData(",
               dds_name, ")[[", column, "]])."))
  }
  # apply shrinkage using ashr and the requested contrast
  shrink_result <- lfcShrink(dds_object, contrast = c(column, level_1, priority_level_2), type = "ashr")
  # convert the shrinkage results to a data frame with GeneID as the first column
  result_df <- as.data.frame(shrink_result)
  result_df$GeneID <- rownames(result_df)
  result_df <- result_df[, c("GeneID", colnames(result_df)[-ncol(result_df)])]
  # make sure the output directory exists so write.csv() does not fail
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  # generate dynamic output file name and write the CSV
  output_file <- paste0(output_dir, "deseq2_", dds_name, "_", level_1, "_vs_", priority_level_2, ".csv")
  write.csv(result_df, output_file, row.names = FALSE)
  # return the data frame invisibly so callers can reuse it without re-reading the CSV
  invisible(result_df)
}
Below will contain the code running all of the various DESeq2 analyses we need along with specific contrasts. Each contrast will be written to a CSV file that will be used for further analyses. There will be many files saved to a subdirectory within my ‘data’ directory.
# run the differential gene expression analysis on the combined-group design
# (estimates size factors and dispersions, then Wald-tests each group vs. wt_0)
dds_treatment <- DESeq(dds_treatment)
# view what conditions were contrasted to make sure they match what we actually meant to compare
resultsNames(dds_treatment)## [1] "Intercept" "group_ko_0_vs_wt_0" "group_ko_20_vs_wt_0"
## [4] "group_ko_40_vs_wt_0" "group_ko_60_vs_wt_0" "group_ko_80_vs_wt_0"
## [7] "group_wt_20_vs_wt_0" "group_wt_40_vs_wt_0" "group_wt_60_vs_wt_0"
## [10] "group_wt_80_vs_wt_0"
# store the results in an object
# with no coef/contrast given, results() defaults to the last coefficient,
# which here is group_wt_80_vs_wt_0 (see the header of the output below)
res_dds_treatment <- results(dds_treatment)
# view DESeq2 results
res_dds_treatment## log2 fold change (MLE): group wt 80 vs wt 0
## Wald test p-value: group wt 80 vs wt 0
## DataFrame with 27710 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## AT1G01010.1 481.647863 1.889492 0.241198 7.833777 4.73430e-15
## AT1G01020.1 0.699483 3.450224 2.921736 1.180881 2.37650e-01
## AT1G01020.2 9.829772 0.135655 0.685272 0.197958 8.43078e-01
## AT1G01030.1 90.232278 -0.655917 0.196783 -3.333195 8.58548e-04
## AT1G01040.1 153.217100 0.534500 0.202396 2.640861 8.26957e-03
## ... ... ... ... ... ...
## ATMG01370.1 0.800246 -1.137773 2.912105 -0.390705 0.6960154
## ATMG01380.1 0.343591 0.785622 6.662562 0.117916 0.9061343
## ATMG01390.1 3.526977 2.062884 1.246262 1.655257 0.0978723
## ATMG01400.1 5.379710 1.215300 1.086761 1.118277 0.2634489
## ATMG01410.1 11.977327 1.988103 0.845472 2.351472 0.0186993
## padj
## <numeric>
## AT1G01010.1 6.72910e-14
## AT1G01020.1 3.62134e-01
## AT1G01020.2 9.05699e-01
## AT1G01030.1 2.75096e-03
## AT1G01040.1 2.09610e-02
## ... ...
## ATMG01370.1 0.7999480
## ATMG01380.1 NA
## ATMG01390.1 0.1764977
## ATMG01400.1 0.3929179
## ATMG01410.1 0.0427462
# summary results of res_dds_treatment (with re-level of group = wt_0)
# the second positional argument (0.05) is `alpha`, the adjusted p-value cutoff
summary(res_dds_treatment, 0.05)##
## out of 27710 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6533, 24%
## LFC < 0 (down) : 5605, 20%
## outliers [1] : 10, 0.036%
## low counts [2] : 538, 1.9%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
We will be using the ashr_shrinkage_contrast() function for the comparisons regarding dds_treatment as they have a reference level of wt_0 but require contrasts between treatments of both genotypes to each other, not to the controls. In other words, we will be comparing wt_20 vs. ko_20, wt_40 vs. ko_40, etc., in these ‘dds’ results. Therefore, we need the ability to be specific about our contrasts (using the contrast() function) and still be able to shrink the data which is not functional with apeglm, but is functional with ashr.
# run the differential gene expression analysis
# (DESeq() estimates size factors and dispersions, then fits the model)
dds_controls <- DESeq(dds_controls)
# view what conditions were contrasted to make sure they match what we actually meant to compare
resultsNames(dds_controls)## [1] "Intercept" "genotype_knockout_line_vs_wild_type"
# store the results in an object
# (only one non-intercept coefficient exists here, so results() reports it)
res_dds_controls <- results(dds_controls)
# view DESeq2 results
res_dds_controls## log2 fold change (MLE): genotype knockout line vs wild type
## Wald test p-value: genotype knockout line vs wild type
## DataFrame with 23973 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## AT1G01010.1 180.09710 0.0909285 0.367753 0.247254 0.804712 0.959745
## AT1G01020.2 8.75265 0.1527107 0.890280 0.171531 0.863806 NA
## AT1G01030.1 104.59159 -0.1776354 0.263761 -0.673471 0.500647 0.848081
## AT1G01040.1 124.72601 0.3887219 0.255211 1.523142 0.127723 0.508203
## AT1G01040.2 8.25645 0.9542243 0.853555 1.117941 0.263592 NA
## ... ... ... ... ... ... ...
## ATMG01320.1 33.85034 0.1306066 0.462083 0.2826478 0.777447 0.953273
## ATMG01340.1 1.67730 1.1252154 1.893954 0.5941091 0.552439 NA
## ATMG01350.1 60.47710 0.0228543 0.398252 0.0573866 0.954237 0.991725
## ATMG01400.1 1.94285 -2.4218435 2.215832 -1.0929727 0.274406 NA
## ATMG01410.1 4.86897 -0.1278612 1.396298 -0.0915716 0.927038 NA
# summary results of res_dds_controls (with re-level of genotype = wild_type)
# the second argument (0.05) is the adjusted p-value (alpha) cutoff for the summary
summary(res_dds_controls, 0.05)##
## out of 23973 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 486, 2%
## LFC < 0 (down) : 587, 2.4%
## outliers [1] : 28, 0.12%
## low counts [2] : 4183, 17%
## (mean count < 11)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
For all of the remaining comparisons, we will be using the coef_shrinkage_contrast() function as the data for the dds_controls, dds_wt, and dds_ko objects have all been subset and re-leveled to reference the wild_type control variant where possible (dds_controls and dds_wt) or the knockout_line control variant in dds_ko. In these cases, we are comparing all instances specifically to the reference level; therefore, the intercepts are viable coefficients to use with the apeglm shrinkage estimator. Note, however, that we cannot use the contrast() function with apeglm.
The DESeq2 vignettes describe apeglm as a higher-performance estimator than ashr. For now, I am using a mix for the sake of ease in repeated production of the resulting dds data sets. Depending on our future needs, we can apply either function to all the data to maintain consistency.
I honestly should not need to even contrast this, as the only rows in the ‘dds_controls’ are the triplicates of the control groups for both the knockout_line and the wild_type variants. However, I have the run_shrinkage_contrast() function already and it seems to work pretty well, and incorporates shrinkage, so we will just use it.
Will contrasting have any negative effect on the outcome of an already completed comparison?
# run the differential gene expression analysis
# (DESeq() estimates size factors and dispersions, then fits the model)
dds_wt <- DESeq(dds_wt)
# view what conditions were contrasted to make sure they match what we actually meant to compare
resultsNames(dds_wt)## [1] "Intercept" "treatment_20_vs_0" "treatment_40_vs_0"
## [4] "treatment_60_vs_0" "treatment_80_vs_0"
## log2 fold change (MLE): treatment 80 vs 0
## Wald test p-value: treatment 80 vs 0
## DataFrame with 26602 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## AT1G01010.1 437.237306 1.879602 0.204738 9.180516 4.29029e-20
## AT1G01020.1 0.672511 3.442177 2.433941 1.414240 1.57291e-01
## AT1G01020.2 9.278346 0.130042 0.762051 0.170648 8.64501e-01
## AT1G01030.1 89.749801 -0.667420 0.209511 -3.185610 1.44449e-03
## AT1G01040.1 143.898223 0.524872 0.204778 2.563133 1.03732e-02
## ... ... ... ... ... ...
## ATMG01350.1 81.75085 0.586523 0.318424 1.84196 0.06548108
## ATMG01360.1 5.36252 3.426499 1.239572 2.76426 0.00570523
## ATMG01390.1 4.51224 2.052131 1.233042 1.66428 0.09605587
## ATMG01400.1 6.88725 1.190211 0.895700 1.32880 0.18391237
## ATMG01410.1 15.96561 1.988374 0.772065 2.57540 0.01001252
## padj
## <numeric>
## AT1G01010.1 7.12325e-19
## AT1G01020.1 NA
## AT1G01020.2 9.11608e-01
## AT1G01030.1 4.10853e-03
## AT1G01040.1 2.42576e-02
## ... ...
## ATMG01350.1 0.1205520
## ATMG01360.1 0.0142260
## ATMG01390.1 0.1669259
## ATMG01400.1 0.2840578
## ATMG01410.1 0.0235037
# summary results of res_dds_wt (with re-level of treatment = 0 for wild_type)
# NOTE(review): the creation of res_dds_wt (e.g. results(dds_wt)) is not shown
# in this document, although its printout appears above — confirm it was run
# the second argument (0.05) is the adjusted p-value (alpha) cutoff for the summary
summary(res_dds_wt, 0.05)##
## out of 26602 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6515, 24%
## LFC < 0 (down) : 5839, 22%
## outliers [1] : 19, 0.071%
## low counts [2] : 516, 1.9%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# run the differential gene expression analysis
# (DESeq() estimates size factors and dispersions, then fits the model)
dds_ko <- DESeq(dds_ko)
# view what conditions were contrasted to make sure they match what we actually meant to compare
resultsNames(dds_ko)## [1] "Intercept" "treatment_20_vs_0" "treatment_40_vs_0"
## [4] "treatment_60_vs_0" "treatment_80_vs_0"
## log2 fold change (MLE): treatment 80 vs 0
## Wald test p-value: treatment 80 vs 0
## DataFrame with 26818 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## AT1G01010.1 528.128492 1.848828 0.257871 7.169578 7.52291e-13
## AT1G01020.1 0.724119 2.813109 3.637539 0.773355 4.39312e-01
## AT1G01020.2 10.397760 -0.272373 0.651345 -0.418170 6.75823e-01
## AT1G01030.1 90.532212 -0.515260 0.219451 -2.347949 1.88771e-02
## AT1G01040.1 162.893461 0.130488 0.209979 0.621433 5.34315e-01
## ... ... ... ... ... ...
## ATMG01360.1 2.663776 1.83537 1.614998 1.136453 0.2557669
## ATMG01370.1 0.972828 3.15069 2.232032 1.411577 0.1580745
## ATMG01390.1 2.418109 3.67246 1.743147 2.106800 0.0351349
## ATMG01400.1 3.689406 3.10262 1.623886 1.910617 0.0560538
## ATMG01410.1 7.511026 -0.25594 0.979801 -0.261216 0.7939258
## padj
## <numeric>
## AT1G01010.1 9.38179e-12
## AT1G01020.1 NA
## AT1G01020.2 7.72073e-01
## AT1G01030.1 4.32020e-02
## AT1G01040.1 6.52509e-01
## ... ...
## ATMG01360.1 0.3750086
## ATMG01370.1 NA
## ATMG01390.1 0.0735021
## ATMG01400.1 0.1092704
## ATMG01410.1 0.8614655
# summary results of res_dds_ko (with re-level of treatment = 0 for knockout_line)
# NOTE(review): the creation of res_dds_ko (e.g. results(dds_ko)) is not shown
# in this document, although its printout appears above — confirm it was run
# the second argument (0.05) is the adjusted p-value (alpha) cutoff for the summary
summary(res_dds_ko, 0.05)##
## out of 26818 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6266, 23%
## LFC < 0 (down) : 5259, 20%
## outliers [1] : 13, 0.048%
## low counts [2] : 1040, 3.9%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
2/5/25
The following are defined terms and acronyms that can be used to distinguish the CSV files produced via the ashr_shrinkage_contrast() and coef_shrinkage_contrast() functions.
Key Title Terms:
deseq2 = package used to run analysis
‘dds’ Object Names:
Contrast Parameters:
For example, one of the csv files produced from the above analyses had the title of: deseq2_dds_ko_20_vs_0.csv
Therefore, this CSV file contains the resulting contrast between the knockout_line sample treated with 20 uM and the knockout_line control sample with 0 uM.
2/18/25
Be sure to install and run the following libraries for the analysis:
# be sure to run install.packages("BiocManager") and use BiocManager::install("package") order to install and run the following libraries
library(clusterProfiler)
library(AnnotationDbi) # warning: will mask select() from dpylr
library(org.At.tair.db) # database used for arabidopsisIn order to conduct GO enrichment analysis, we will need to read in our produced CSV files (from section #7) as data frames. The following chunk of code will loop through our data folder that contains the CSV files and save them as individual data frames in our global environment for further use.
The code will keep the same name for the files but change the beginning portion of “deseq2_” to “df_” and will only read in those rows with a padj < 0.05. Be sure to also replace the file path when creating the data_folder variable to wherever your CSV files were saved (will be the same path as used in the coef and ashr functions if those were utilized).
# define the data folder path
data_folder <- "../data/arabidopsis_deseq2_results/" # change to your actual data folder path
# collect every CSV file found in that folder
csv_files <- list.files(data_folder, pattern = "\\.csv$", full.names = TRUE)
# avoid scientific notation
options(scipen = 999)
# read each CSV, keep only the significant rows (padj < 0.05, NA padj removed),
# and expose the filtered table in the global environment; the variable name is
# the file name with its "deseq2_" prefix swapped for "df_"
for (csv_path in csv_files) {
  # file name without directory or ".csv" extension
  base_name <- tools::file_path_sans_ext(basename(csv_path))
  # rename the leading "deseq2_" to "df_" (always present when the ashr/coef functions were used)
  frame_name <- sub("^deseq2_", "df_", base_name)
  # load the CSV as a plain data frame
  results_table <- read.csv(csv_path, stringsAsFactors = FALSE)
  # drop rows with padj = NA, then keep only padj < 0.05
  significant_rows <- !is.na(results_table$padj) & results_table$padj < 0.05
  # save the filtered data frame to the global environment
  assign(frame_name, results_table[significant_rows, ], envir = .GlobalEnv)
}Function made to allow for modular Gene Ontology Enrichment analysis result output. The function’s parameters are as follows:
# Run a GO enrichment analysis (clusterProfiler::enrichGO) on either the up-
# or down-regulated genes of a filtered DESeq2 results data frame.
#
# Arguments:
#   dataframe    - data frame with GeneID and log2FoldChange columns
#   up_or_down   - "up" (log2FoldChange > 0) or "down" (log2FoldChange < 0)
#   database     - annotation database passed to enrichGO's OrgDb
#   key          - key type for the gene identifiers
#   ont_category - ontology category: "BP", "MF", or "CC"
GO_results <- function(dataframe, up_or_down, database = "org.At.tair.db", key = "TAIR", ont_category = "BP") {
  # strip any isoform suffix (like ".1") from the GeneIDs (won't work otherwise)
  dataframe$GeneID <- sub("\\..*", "", dataframe$GeneID)
  # choose the fold-change filter matching the requested regulation direction
  if (up_or_down == "up") {
    keep <- dataframe$log2FoldChange > 0
  } else if (up_or_down == "down") {
    keep <- dataframe$log2FoldChange < 0
  } else {
    stop("Invalid value for 'up_or_down'. Use 'up' or 'down'.")
  }
  genes_to_test <- dataframe$GeneID[keep]
  # perform GO enrichment analysis on the selected gene set
  go_results <- enrichGO(gene = genes_to_test, OrgDb = database, keyType = key, ont = ont_category)
  return(go_results)
}This function is used to convert the GO analysis results into a more reader-friendly data frame, write the contents to a CSV file, and then sort them into respective folders for either “up” or “down” regulation.
# Convert a GO enrichment result to a data frame and write it to a CSV file,
# sorted into a direction-specific subfolder ("up_results/" or "down_results/").
#
# Arguments:
#   go_result - a GO enrichment result (e.g. from GO_results()); anything
#               coercible via as.data.frame(). The VARIABLE NAME passed here
#               becomes the CSV file name, so name the object something useful.
#   direction - gene regulation direction, "up" or "down"; selects the
#               "<direction>_results/" output subfolder.
#   file_path - base output directory; created (with parents) if missing.
#
# Returns: the written file path, invisibly.
save_GO_results <- function(go_result, direction, file_path = "../data/arabidopsis_GO_results/") {
  # convert the GO results object to a data frame
  go_result_df <- as.data.frame(go_result)
  # capture the go_result variable name as a string (used for the file name)
  object_name <- deparse(substitute(go_result))
  # build the direction-specific output folder; file.path() inserts the
  # separator itself, so file_path works with or without a trailing "/"
  # (the old paste0() concatenation silently produced "...GO_resultsup_results/"
  # when the trailing slash was missing)
  output_folder <- file.path(file_path, paste0(direction, "_results"))
  # create the folder (and any missing parents) if it doesn't exist
  dir.create(output_folder, recursive = TRUE, showWarnings = FALSE)
  # define the output file name using the go_result variable name
  output_file <- file.path(output_folder, paste0(object_name, ".csv"))
  # write the data frame to a CSV file
  write.csv(go_result_df, file = output_file, row.names = FALSE)
  # return the path invisibly so interactive calls stay quiet
  invisible(output_file)
}The following code will utilize clusterProfiler, AnnotationDbi, the GO_results(), and save_GO_results() functions to annotate the genes found in each of the individual CSV files we have just read into data frames. We will select for both up-regulated (+log2FoldChange) and down-regulated (-log2FoldChange) for each CSV file. We will also be conducting this on each of the three gene ontology categories (biological processes, molecular functions, cellular components).
KO Treatment vs. WT Treatment:
# GO enrichment (up/down x BP/MF/CC) for each KO-vs-WT treatment contrast.
# Each result variable is deliberately named <dir>_<ont>_<comparison>_GO
# because save_GO_results() turns the variable name into the CSV file name.
# WT_20 vs. KO_20
up_BP_ko_20_vs_wt_20_GO <- GO_results(df_dds_treatment_ko_20_vs_wt_20, "up") # actual GO results (up-regulated)/default ont_category = "BP"
save_GO_results(up_BP_ko_20_vs_wt_20_GO, "up") # GO results converted to a dataframe and saved to a CSV file
down_BP_ko_20_vs_wt_20_GO <- GO_results(df_dds_treatment_ko_20_vs_wt_20, "down") # (down-regulated)
save_GO_results(down_BP_ko_20_vs_wt_20_GO, "down")
up_MF_ko_20_vs_wt_20_GO <- GO_results(df_dds_treatment_ko_20_vs_wt_20, "up", ont_category = "MF")
save_GO_results(up_MF_ko_20_vs_wt_20_GO, "up")
down_MF_ko_20_vs_wt_20_GO <- GO_results(df_dds_treatment_ko_20_vs_wt_20, "down", ont_category = "MF")
save_GO_results(down_MF_ko_20_vs_wt_20_GO, "down")
up_CC_ko_20_vs_wt_20_GO <- GO_results(df_dds_treatment_ko_20_vs_wt_20, "up", ont_category = "CC")
save_GO_results(up_CC_ko_20_vs_wt_20_GO, "up")
down_CC_ko_20_vs_wt_20_GO <- GO_results(df_dds_treatment_ko_20_vs_wt_20, "down", ont_category = "CC")
save_GO_results(down_CC_ko_20_vs_wt_20_GO, "down")
# WT_40 vs. KO_40
up_BP_ko_40_vs_wt_40_GO <- GO_results(df_dds_treatment_ko_40_vs_wt_40, "up")
save_GO_results(up_BP_ko_40_vs_wt_40_GO, "up")
down_BP_ko_40_vs_wt_40_GO <- GO_results(df_dds_treatment_ko_40_vs_wt_40, "down")
save_GO_results(down_BP_ko_40_vs_wt_40_GO, "down")
up_MF_ko_40_vs_wt_40_GO <- GO_results(df_dds_treatment_ko_40_vs_wt_40, "up", ont_category = "MF")
save_GO_results(up_MF_ko_40_vs_wt_40_GO, "up")
down_MF_ko_40_vs_wt_40_GO <- GO_results(df_dds_treatment_ko_40_vs_wt_40, "down", ont_category = "MF")
save_GO_results(down_MF_ko_40_vs_wt_40_GO, "down")
up_CC_ko_40_vs_wt_40_GO <- GO_results(df_dds_treatment_ko_40_vs_wt_40, "up", ont_category = "CC")
save_GO_results(up_CC_ko_40_vs_wt_40_GO, "up")
down_CC_ko_40_vs_wt_40_GO <- GO_results(df_dds_treatment_ko_40_vs_wt_40, "down", ont_category = "CC")
save_GO_results(down_CC_ko_40_vs_wt_40_GO, "down")
# WT_60 vs. KO_60
up_BP_ko_60_vs_wt_60_GO <- GO_results(df_dds_treatment_ko_60_vs_wt_60, "up")
save_GO_results(up_BP_ko_60_vs_wt_60_GO, "up")
down_BP_ko_60_vs_wt_60_GO <- GO_results(df_dds_treatment_ko_60_vs_wt_60, "down")
save_GO_results(down_BP_ko_60_vs_wt_60_GO, "down")
up_MF_ko_60_vs_wt_60_GO <- GO_results(df_dds_treatment_ko_60_vs_wt_60, "up", ont_category = "MF")
save_GO_results(up_MF_ko_60_vs_wt_60_GO, "up")
down_MF_ko_60_vs_wt_60_GO <- GO_results(df_dds_treatment_ko_60_vs_wt_60, "down", ont_category = "MF")
save_GO_results(down_MF_ko_60_vs_wt_60_GO, "down")
up_CC_ko_60_vs_wt_60_GO <- GO_results(df_dds_treatment_ko_60_vs_wt_60, "up", ont_category = "CC")
save_GO_results(up_CC_ko_60_vs_wt_60_GO, "up")
down_CC_ko_60_vs_wt_60_GO <- GO_results(df_dds_treatment_ko_60_vs_wt_60, "down", ont_category = "CC")
save_GO_results(down_CC_ko_60_vs_wt_60_GO, "down")
# WT_80 vs. KO_80
up_BP_ko_80_vs_wt_80_GO <- GO_results(df_dds_treatment_ko_80_vs_wt_80, "up")
save_GO_results(up_BP_ko_80_vs_wt_80_GO, "up")
down_BP_ko_80_vs_wt_80_GO <- GO_results(df_dds_treatment_ko_80_vs_wt_80, "down")
save_GO_results(down_BP_ko_80_vs_wt_80_GO, "down")
up_MF_ko_80_vs_wt_80_GO <- GO_results(df_dds_treatment_ko_80_vs_wt_80, "up", ont_category = "MF")
save_GO_results(up_MF_ko_80_vs_wt_80_GO, "up")
down_MF_ko_80_vs_wt_80_GO <- GO_results(df_dds_treatment_ko_80_vs_wt_80, "down", ont_category = "MF")
save_GO_results(down_MF_ko_80_vs_wt_80_GO, "down")
up_CC_ko_80_vs_wt_80_GO <- GO_results(df_dds_treatment_ko_80_vs_wt_80, "up", ont_category = "CC")
save_GO_results(up_CC_ko_80_vs_wt_80_GO, "up")
down_CC_ko_80_vs_wt_80_GO <- GO_results(df_dds_treatment_ko_80_vs_wt_80, "down", ont_category = "CC")
save_GO_results(down_CC_ko_80_vs_wt_80_GO, "down")Only Controls:
# GO enrichment (up/down x BP/MF/CC) for the control-only contrast
# (knockout_line vs. wild_type at 0 uM); variable names feed save_GO_results()
# WT_0 vs. KO_0
up_BP_control_ko_0_vs_wt_0_GO <- GO_results(df_dds_controls_knockout_line_vs_wild_type, "up")
save_GO_results(up_BP_control_ko_0_vs_wt_0_GO, "up")
down_BP_control_ko_0_vs_wt_0_GO <- GO_results(df_dds_controls_knockout_line_vs_wild_type, "down")
save_GO_results(down_BP_control_ko_0_vs_wt_0_GO, "down")
up_MF_control_ko_0_vs_wt_0_GO <- GO_results(df_dds_controls_knockout_line_vs_wild_type, "up", ont_category = "MF")
save_GO_results(up_MF_control_ko_0_vs_wt_0_GO, "up")
down_MF_control_ko_0_vs_wt_0_GO <- GO_results(df_dds_controls_knockout_line_vs_wild_type, "down", ont_category = "MF")
save_GO_results(down_MF_control_ko_0_vs_wt_0_GO, "down")
up_CC_control_ko_0_vs_wt_0_GO <- GO_results(df_dds_controls_knockout_line_vs_wild_type, "up", ont_category = "CC")
save_GO_results(up_CC_control_ko_0_vs_wt_0_GO, "up")
down_CC_control_ko_0_vs_wt_0_GO <- GO_results(df_dds_controls_knockout_line_vs_wild_type, "down", ont_category = "CC")
save_GO_results(down_CC_control_ko_0_vs_wt_0_GO, "down")Only Wild-Type Samples:
# GO enrichment (up/down x BP/MF/CC) for each wild-type treatment vs. its
# 0 uM control; variable names feed save_GO_results()
# WT_0 vs. WT_20
up_BP_wt_20_vs_0_GO <- GO_results(df_dds_wt_20_vs_0, "up")
save_GO_results(up_BP_wt_20_vs_0_GO, "up")
down_BP_wt_20_vs_0_GO <- GO_results(df_dds_wt_20_vs_0, "down")
save_GO_results(down_BP_wt_20_vs_0_GO, "down")
up_MF_wt_20_vs_0_GO <- GO_results(df_dds_wt_20_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_wt_20_vs_0_GO, "up")
down_MF_wt_20_vs_0_GO <- GO_results(df_dds_wt_20_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_wt_20_vs_0_GO, "down")
up_CC_wt_20_vs_0_GO <- GO_results(df_dds_wt_20_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_wt_20_vs_0_GO, "up")
down_CC_wt_20_vs_0_GO <- GO_results(df_dds_wt_20_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_wt_20_vs_0_GO, "down")
# WT_0 vs. WT_40
up_BP_wt_40_vs_0_GO <- GO_results(df_dds_wt_40_vs_0, "up")
save_GO_results(up_BP_wt_40_vs_0_GO, "up")
down_BP_wt_40_vs_0_GO <- GO_results(df_dds_wt_40_vs_0, "down")
save_GO_results(down_BP_wt_40_vs_0_GO, "down")
up_MF_wt_40_vs_0_GO <- GO_results(df_dds_wt_40_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_wt_40_vs_0_GO, "up")
down_MF_wt_40_vs_0_GO <- GO_results(df_dds_wt_40_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_wt_40_vs_0_GO, "down")
up_CC_wt_40_vs_0_GO <- GO_results(df_dds_wt_40_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_wt_40_vs_0_GO, "up")
down_CC_wt_40_vs_0_GO <- GO_results(df_dds_wt_40_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_wt_40_vs_0_GO, "down")
# WT_0 vs. WT_60
up_BP_wt_60_vs_0_GO <- GO_results(df_dds_wt_60_vs_0, "up")
save_GO_results(up_BP_wt_60_vs_0_GO, "up")
down_BP_wt_60_vs_0_GO <- GO_results(df_dds_wt_60_vs_0, "down")
save_GO_results(down_BP_wt_60_vs_0_GO, "down")
up_MF_wt_60_vs_0_GO <- GO_results(df_dds_wt_60_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_wt_60_vs_0_GO, "up")
down_MF_wt_60_vs_0_GO <- GO_results(df_dds_wt_60_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_wt_60_vs_0_GO, "down")
up_CC_wt_60_vs_0_GO <- GO_results(df_dds_wt_60_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_wt_60_vs_0_GO, "up")
down_CC_wt_60_vs_0_GO <- GO_results(df_dds_wt_60_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_wt_60_vs_0_GO, "down")
# WT_0 vs. WT_80
up_BP_wt_80_vs_0_GO <- GO_results(df_dds_wt_80_vs_0, "up")
save_GO_results(up_BP_wt_80_vs_0_GO, "up")
down_BP_wt_80_vs_0_GO <- GO_results(df_dds_wt_80_vs_0, "down")
save_GO_results(down_BP_wt_80_vs_0_GO, "down")
up_MF_wt_80_vs_0_GO <- GO_results(df_dds_wt_80_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_wt_80_vs_0_GO, "up")
down_MF_wt_80_vs_0_GO <- GO_results(df_dds_wt_80_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_wt_80_vs_0_GO, "down")
up_CC_wt_80_vs_0_GO <- GO_results(df_dds_wt_80_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_wt_80_vs_0_GO, "up")
down_CC_wt_80_vs_0_GO <- GO_results(df_dds_wt_80_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_wt_80_vs_0_GO, "down")Only Knockout-Line Samples:
# GO enrichment (up/down x BP/MF/CC) for each knockout-line treatment vs. its
# 0 uM control; variable names feed save_GO_results()
# KO_0 vs. KO_20
up_BP_ko_20_vs_0_GO <- GO_results(df_dds_ko_20_vs_0, "up")
save_GO_results(up_BP_ko_20_vs_0_GO, "up")
down_BP_ko_20_vs_0_GO <- GO_results(df_dds_ko_20_vs_0, "down")
save_GO_results(down_BP_ko_20_vs_0_GO, "down")
up_MF_ko_20_vs_0_GO <- GO_results(df_dds_ko_20_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_ko_20_vs_0_GO, "up")
down_MF_ko_20_vs_0_GO <- GO_results(df_dds_ko_20_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_ko_20_vs_0_GO, "down")
up_CC_ko_20_vs_0_GO <- GO_results(df_dds_ko_20_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_ko_20_vs_0_GO, "up")
down_CC_ko_20_vs_0_GO <- GO_results(df_dds_ko_20_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_ko_20_vs_0_GO, "down")
# KO_0 vs. KO_40
up_BP_ko_40_vs_0_GO <- GO_results(df_dds_ko_40_vs_0, "up")
save_GO_results(up_BP_ko_40_vs_0_GO, "up")
down_BP_ko_40_vs_0_GO <- GO_results(df_dds_ko_40_vs_0, "down")
save_GO_results(down_BP_ko_40_vs_0_GO, "down")
up_MF_ko_40_vs_0_GO <- GO_results(df_dds_ko_40_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_ko_40_vs_0_GO, "up")
down_MF_ko_40_vs_0_GO <- GO_results(df_dds_ko_40_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_ko_40_vs_0_GO, "down")
up_CC_ko_40_vs_0_GO <- GO_results(df_dds_ko_40_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_ko_40_vs_0_GO, "up")
down_CC_ko_40_vs_0_GO <- GO_results(df_dds_ko_40_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_ko_40_vs_0_GO, "down")
# KO_0 vs. KO_60
up_BP_ko_60_vs_0_GO <- GO_results(df_dds_ko_60_vs_0, "up")
save_GO_results(up_BP_ko_60_vs_0_GO, "up")
down_BP_ko_60_vs_0_GO <- GO_results(df_dds_ko_60_vs_0, "down")
save_GO_results(down_BP_ko_60_vs_0_GO, "down")
up_MF_ko_60_vs_0_GO <- GO_results(df_dds_ko_60_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_ko_60_vs_0_GO, "up")
down_MF_ko_60_vs_0_GO <- GO_results(df_dds_ko_60_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_ko_60_vs_0_GO, "down")
up_CC_ko_60_vs_0_GO <- GO_results(df_dds_ko_60_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_ko_60_vs_0_GO, "up")
down_CC_ko_60_vs_0_GO <- GO_results(df_dds_ko_60_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_ko_60_vs_0_GO, "down")
# KO_0 vs. KO_80
up_BP_ko_80_vs_0_GO <- GO_results(df_dds_ko_80_vs_0, "up")
save_GO_results(up_BP_ko_80_vs_0_GO, "up")
down_BP_ko_80_vs_0_GO <- GO_results(df_dds_ko_80_vs_0, "down")
save_GO_results(down_BP_ko_80_vs_0_GO, "down")
up_MF_ko_80_vs_0_GO <- GO_results(df_dds_ko_80_vs_0, "up", ont_category = "MF")
save_GO_results(up_MF_ko_80_vs_0_GO, "up")
down_MF_ko_80_vs_0_GO <- GO_results(df_dds_ko_80_vs_0, "down", ont_category = "MF")
save_GO_results(down_MF_ko_80_vs_0_GO, "down")
up_CC_ko_80_vs_0_GO <- GO_results(df_dds_ko_80_vs_0, "up", ont_category = "CC")
save_GO_results(up_CC_ko_80_vs_0_GO, "up")
down_CC_ko_80_vs_0_GO <- GO_results(df_dds_ko_80_vs_0, "down", ont_category = "CC")
save_GO_results(down_CC_ko_80_vs_0_GO, "down")2/25/25
The following function produces a CSV file/table that contains the count of gene ontology description terms from the BP, MF, and CC annotations. It does so by parsing through the associated “up” and “down” regulated gene annotation files and inserts the values into their appropriate rows/columns. It looks for patterns denoted by the title of the CSV files saved in the previous step of annotating genes. It searches a specific directory (my personal one is the default) and searches for more specific directories that separated the “up” and “down” results (the default are set to my naming convention). The tables will be saved to their own new folder (named GO_term_counts) within your chosen data_file_path directory.
The function’s arguments are:
Note: there is a lot of debugging code in this function to see how it works as I struggled writing it for quite some time.
# Count significant GO terms (p.adjust < 0.05) per ontology category (BP, MF,
# CC) and regulation direction ("up"/"down") for one comparison, and save the
# resulting 3x2 table to <data_file_path>/GO_term_counts/<comparison>_GO_counts.csv.
#
# Arguments:
#   data_file_path      - base folder holding the GO result CSVs
#   comparison_pattern  - comparison identifier used in the CSV file names,
#                         e.g. "ko_20_vs_wt_20" (matches the naming convention
#                         from the gene-annotation step)
#   up_regulated_data   - subfolder containing the up-regulated GO CSVs
#   down_regulated_data - subfolder containing the down-regulated GO CSVs
#
# Returns: the counts matrix, invisibly (also written to CSV as a side effect).
count_GO_terms <- function(data_file_path = "../data/arabidopsis_GO_results",
                           comparison_pattern,
                           up_regulated_data = "up_results",
                           down_regulated_data = "down_results") {
  # define the GO categories
  go_categories <- c("BP", "MF", "CC")
  # initialize the results matrix
  results_matrix <- matrix(0, nrow = length(go_categories), ncol = 2,
                           dimnames = list(go_categories, c("up", "down")))
  # helper function used to process files in a given folder and direction ("up" or "down")
  process_files <- function(folder, direction, results_matrix) {
    for (category in go_categories) {
      # build the exact file name for this direction/category/comparison;
      # matching on the full basename (instead of passing the name as a regex
      # to list.files(pattern = ...)) avoids two former bugs: "." matched any
      # character, and unanchored substring matches could pick up files whose
      # names merely CONTAIN the comparison pattern
      target_name <- paste0(direction, "_", category, "_", comparison_pattern, "_GO.csv")
      all_files <- list.files(folder, full.names = TRUE)
      files <- all_files[basename(all_files) == target_name]
      if (length(files) > 0) {
        file <- files[1] # exact basename match, so at most one file
        # read the matching CSV file
        data <- read.csv(file, stringsAsFactors = FALSE)
        # count rows where p.adjust < 0.05 (matches the number of significant
        # GO Description terms); NA values are ignored
        count <- sum(data$p.adjust < 0.05, na.rm = TRUE)
        # DEBUG: check the count and where it is being assigned
        print(paste("Updating matrix for category:", category, "and direction:", direction))
        print(paste("Count to assign:", count))
        # store result in matrix (direction is already "up" or "down")
        results_matrix[category, direction] <- count
        # DEBUG: display the matrix after each count assignment
        print(results_matrix)
      } else {
        # if no files are found (i.e. incorrect pattern input), print a warning error
        warning(paste("No matching file found for", direction, category, "in", folder))
      }
    }
    return(results_matrix) # return updated matrix
  }
  # define the output path
  output_dir <- file.path(data_file_path, "GO_term_counts")
  # create the directory and any missing parents (if it doesn't already exist)
  dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
  # process both the "up" and "down" CSV file results
  results_matrix <- process_files(file.path(data_file_path, up_regulated_data), "up", results_matrix)
  results_matrix <- process_files(file.path(data_file_path, down_regulated_data), "down", results_matrix)
  # DEBUG: display the final results matrix
  print("Final results_matrix:")
  print(results_matrix)
  # define the output path and save the results once the entire matrix is complete
  output_file <- file.path(output_dir, paste0(comparison_pattern, "_GO_counts.csv"))
  write.csv(as.data.frame(results_matrix), output_file, row.names = TRUE)
  # return the counts invisibly for programmatic use
  invisible(results_matrix)
}The following code chunks are the running of the count_GO_terms() function to produce a table for each comparison.
KO Treatment vs. WT Treatment:
## [1] "Updating matrix for category: BP and direction: up"
## [1] "Count to assign: 72"
## up down
## BP 72 0
## MF 0 0
## CC 0 0
## [1] "Updating matrix for category: MF and direction: up"
## [1] "Count to assign: 13"
## up down
## BP 72 0
## MF 13 0
## CC 0 0
## [1] "Updating matrix for category: CC and direction: up"
## [1] "Count to assign: 22"
## up down
## BP 72 0
## MF 13 0
## CC 22 0
## [1] "Updating matrix for category: BP and direction: down"
## [1] "Count to assign: 112"
## up down
## BP 72 112
## MF 13 0
## CC 22 0
## [1] "Updating matrix for category: MF and direction: down"
## [1] "Count to assign: 6"
## up down
## BP 72 112
## MF 13 6
## CC 22 0
## [1] "Updating matrix for category: CC and direction: down"
## [1] "Count to assign: 39"
## up down
## BP 72 112
## MF 13 6
## CC 22 39
## [1] "Final results_matrix:"
## up down
## BP 72 112
## MF 13 6
## CC 22 39
# produce the significant-GO-term count tables for each comparison
# NOTE(review): the ko_20_vs_wt_20 call is not shown here, although its debug
# output appears above — confirm it was run
# WT_40 vs. KO_40
count_GO_terms(comparison_pattern = "ko_40_vs_wt_40")
# WT_60 vs. KO_60
count_GO_terms(comparison_pattern = "ko_60_vs_wt_60")
# WT_80 vs. KO_80
# NOTE(review): the "Only Controls" and "Only Knockout-Line Samples" sections
# below contain no count_GO_terms() calls — confirm whether they were omitted
# intentionally or lost in rendering
count_GO_terms(comparison_pattern = "ko_80_vs_wt_80")Only Controls:
Only Wild-Type Samples:
# WT_0 vs. WT_20
count_GO_terms(comparison_pattern = "wt_20_vs_0")
# WT_0 vs. WT_40
count_GO_terms(comparison_pattern = "wt_40_vs_0")
# WT_0 vs. WT_60
count_GO_terms(comparison_pattern = "wt_60_vs_0")
# WT_0 vs. WT_80
count_GO_terms(comparison_pattern = "wt_80_vs_0")Only Knockout-Line Samples:
Double check that this works correctly by looking at the values directly; I think it struggles when there are matching values
The following function uses similar logic to the count_GO_terms() function. The difference is that instead of making a table with significant GO term counts, it makes a horizontal bar plot that displays the most significant GO terms for each category (BP, MF, CC) across both the up- and down-regulated gene annotations for a particular comparison. In other words, it parses through all associated data sets for the chosen comparison_pattern and finds the smallest p.adjust values across the up- and down-regulated genes combined. It then separates them by category.
The function’s arguments are the same as the count_GO_terms() function, however, there is now an option to select how many significant terms we want in the plot.
# Build a horizontal bar plot of the most significant GO terms (smallest
# p.adjust) per ontology category (BP/MF/CC) for one comparison, pooling the
# up- and down-regulated annotation CSVs before ranking.
#
# Arguments:
#   data_file_path      - base folder holding the GO result CSVs
#   comparison_pattern  - comparison identifier used in the CSV file names
#   top_n               - number of top terms to keep per category
#   up_regulated_data   - subfolder containing the up-regulated GO CSVs
#   down_regulated_data - subfolder containing the down-regulated GO CSVs
#
# Returns: a ggplot object (the last expression of the function body).
plot_top_GO_terms <- function(data_file_path = "../data/arabidopsis_GO_results",
                              comparison_pattern,
                              top_n = 5,
                              up_regulated_data = "up_results",
                              down_regulated_data = "down_results") {
  # define the GO categories
  go_categories <- c("BP", "MF", "CC")
  all_data <- data.frame() # initialize a data frame to store all GO term info
  # helper function used to process files in a given folder and direction ("up" or "down")
  process_files <- function(folder, direction) {
    category_data <- data.frame() # initialize a data frame for the category info
    for (category in go_categories) {
      # build the exact file name pattern
      # NOTE(review): this string is passed to list.files() as a regex, so "."
      # matches any character and the match is unanchored — same caveat as in
      # count_GO_terms(); fine while file names follow the convention exactly
      file_pattern <- paste0(direction, "_", category, "_", comparison_pattern, "_GO.csv")
      # list out files matching the name pattern (should be only one per category (BP, MF, CC))
      files <- list.files(folder, pattern = file_pattern, full.names = TRUE)
      # take the initial match (should only be one anyways)
      if (length(files) > 0) {
        data <- read.csv(files[1], stringsAsFactors = FALSE)
        # keep only non-NA p.adjust rows and tag each with Category and Regulation
        data <- data |>
          dplyr::filter(!is.na(p.adjust)) |>
          dplyr::mutate(Category = category, Regulation = direction)
        # bind the data for a category to the category_data data frame
        category_data <- dplyr::bind_rows(category_data, data)
      } else {
        # if no files are found (i.e. incorrect pattern input), print a warning error
        warning(paste("No matching file found for", direction, category, "in", folder))
      }
    }
    return(category_data) # return the collected data for the given direction
  }
  # process both the "up" and "down" CSV file results
  up_data <- process_files(file.path(data_file_path, up_regulated_data), "up")
  down_data <- process_files(file.path(data_file_path, down_regulated_data), "down")
  # combine the up-regulated and down-regulated data into a single data frame (in the form of all_data)
  all_data <- dplyr::bind_rows(up_data, down_data)
  # if no GO terms were collected at all, stop
  # (note: rows were only filtered for non-NA p.adjust above, not for < 0.05,
  # but the saved CSVs are expected to contain enriched terms already)
  if (nrow(all_data) == 0) {
    stop("No significant GO terms found for given comparison pattern.")
  }
  # select the top_n GO terms per category (smallest p.adjust first)
  top_go_data <- all_data |>
    group_by(Category) |>
    arrange(p.adjust) |> # sort by smallest p.adjust to get the most significant terms (ascending order)
    slice_head(n = top_n) |> # select the top GO terms per category (choose how many when calling the function)
    ungroup() |>
    mutate(Label = paste0(ID, ": ", Description)) # create a label for each GO term (ID: Description)
  # plot the top GO terms
  ggplot(top_go_data, aes(x = reorder(Label, p.adjust), y = -log10(p.adjust), fill = Regulation)) +
    geom_col(position = position_dodge(width = 0.7)) + # use position_dodge to place up/down bars side by side
    coord_flip() + # flip coordinates so the bars are horizontal
    facet_grid(Category ~ ., scales = "free_y", space = "free") + # categories grouped together with free y-scales
    scale_fill_manual(values = c("up" = "lightblue", "down" = "pink")) + # set colors for up and down regulation
    labs(title = paste(top_n, "Most Significant GO Terms Across ONT Categories for", comparison_pattern),
         x = "GO Term (ID: Description)",
         y = "-log10(p.adjust)",
         fill = "Regulation") +
    theme_minimal() +
    theme(strip.text.y = element_text(angle = 0)) # keep facet labels horizontal (for categories)
}Example run using the control group comparison:
The following code is going to explore the various results stemming from our differential gene expression analysis through a few visualization techniques.
3/18/25
Using a dds object comparison and its results object produced earlier in the script, we will make a heat map that demonstrates the pattern among the 50 highest and 50 lowest differentially expressed genes according to their log2FoldChange values. This is simply to visualize the pattern.
For this initial example, we will be using dds_controls and res_dds_controls which are the dds objects comparing ko_0 vs. wt_0. I mainly started with this because it did not need any further contrasting but if we want to test more we would have to contrast accordingly to the samples.
This heat map filters out for padj < 0.05 and omits the NA values
found in the padj column:
This heat map lacks filtering for padj < 0.05 (i.e. includes
everything):
Should we filter the padj or leave it as is? Should we use normalized counts? I’m assuming yes!
3/17/25
The following code will utilize KEGG Pathway Analysis to both find and visualize certain biological pathways and how they are affected by the up and down regulation of certain genes.
Use the following packages (as well as the others used in this script so far) to conduct this analysis.
library(DOSE)
library(pathview)
# search database to make sure our organism is actually available (newsflash: it is)
search_kegg_organism('Arabidopsis thaliana', by = 'scientific_name')

|     | kegg_code | scientific_name      | common_name |
|-----|-----------|----------------------|-------------|
| 565 | ath       | Arabidopsis thaliana | thale cress |
IGNORE THE SECTION TITLED ‘CONVERT TAIR IDS TO ENTREZ IDS’ BECAUSE ACCORDING TO THE FOLLOWING:
## ktax.id tax.id kegg.code
## "T00041" "3702" "ath"
## scientific.name common.name entrez.gnodes
## "Arabidopsis thaliana" "thale cress" "0"
## kegg.geneid ncbi.geneid ncbi.proteinid
## "AT2G23820" "816914" "NP_179962"
## uniprot
## "F4IMN2"
WE ACTUALLY DO NOT NEED THE ENTREZID IN ORDER TO CONDUCT KEGG PATHWAY ANALYSIS USING ENRICHKEGG() AND CLUSTERPROFILER FOR ARABIDOPSIS!
THE FOLLOWING CODE IS SIMPLY FOR REFERENCE:
# REFERENCE ONLY -- pick any of the DESeq2 results tables (here: knockout-line
# vs. wild-type controls). Work on a copy whose GeneIDs have the trailing
# ".<digits>" transcript suffix stripped so they match the TAIR gene-ID format.
df_dds_controls_ko_vs_wt_entrez <-
  mutate(df_dds_controls_knockout_line_vs_wild_type,
         GeneID = sub("\\.\\d+$", "", GeneID))
# extract the GeneID column (TAIR-format IDs)
tair_ids_controls <- df_dds_controls_ko_vs_wt_entrez$GeneID
# convert TAIR IDs to ENTREZ IDs.
# FIX: namespace-qualify select(). With the tidyverse attached, a bare select()
# can resolve to dplyr::select() (depending on package load order) and fail on
# an AnnotationDb object, so call AnnotationDbi::select() explicitly.
entrez_mapping_controls <- AnnotationDbi::select(org.At.tair.db,
                                                 keys = tair_ids_controls,
                                                 keytype = "TAIR",
                                                 columns = "ENTREZID")
# when matching TAIR IDs to ENTREZ IDs, a warning denotes many:1 mapping instances
# of TAIR to ENTREZ (i.e. various instances of the same TAIR value matched to the
# same ENTREZ value). This appears to come from transcript variety (.1 vs. .3),
# which should be fine -- TODO confirm. Inspect the duplicated keys here:
entrez_mapping_controls |>
  dplyr::count(TAIR) |>
  dplyr::filter(n > 1)
# left-outer merge the ENTREZ mapping onto the working data frame so both ID
# systems are kept side by side (all.x = TRUE retains genes with no ENTREZ match)
df_dds_controls_ko_vs_wt_entrez <- merge(
  df_dds_controls_ko_vs_wt_entrez, entrez_mapping_controls,
  by.x = "GeneID", by.y = "TAIR", all.x = TRUE
)
# place ENTREZID immediately after GeneID and drop the duplicate rows that the
# many:1 TAIR-to-ENTREZ mapping introduced
df_dds_controls_ko_vs_wt_entrez <- df_dds_controls_ko_vs_wt_entrez |>
  dplyr::relocate(ENTREZID, .after = GeneID) |>
  distinct()
# View(df_dds_controls_ko_vs_wt_entrez)  (reference only -- IGNORE THIS AS WELL!)
# flag each gene as UP or DOWN regulated from the sign of its log2FoldChange
# (exactly-zero or NA fold changes stay NA, matching case_when semantics)
df_dds_controls_ko_vs_wt_entrez <- df_dds_controls_ko_vs_wt_entrez |>
  mutate(regulation = dplyr::if_else(
    log2FoldChange > 0, "UP",
    dplyr::if_else(log2FoldChange < 0, "DOWN", NA_character_)
  ))
# drop genes for which the TAIR-to-ENTREZ mapping found no ENTREZ ID
df_dds_controls_ko_vs_wt_entrez <-
  subset(df_dds_controls_ko_vs_wt_entrez, !is.na(ENTREZID))
# select whichever DESeq2 results you like (I'm testing knockout-line vs. wild-type controls)
# strip the trailing ".<digits>" transcript suffix to match TAIR format
# (remember: padj is already < 0.05 upstream, so no further filtering is needed)
dds_kegg_controls <- df_dds_controls_knockout_line_vs_wild_type |>
  mutate(GeneID = sub("\\.\\d+$", "", GeneID))
# build the gene list for KEGG analysis: a numeric vector of log2FoldChange
# values named by GeneID
geneList_controls <- setNames(dds_kegg_controls$log2FoldChange,
                              dds_kegg_controls$GeneID)
# run KEGG over-representation analysis on the full list (up- and down-regulated
# genes together); yields over-represented pathways to visualize in the next step
kegg_ora_analysis_controls <- enrichKEGG(gene = names(geneList_controls),
                                         organism = 'ath',
                                         pvalueCutoff = 0.05)
# View(kegg_ora_analysis_controls)We can then visualize one of the particular pathways:
Remove ‘eval = FALSE’ when it starts working . . .
# use pathview() to visualize the chosen KEGG pathway, overlaying the supplied
# log2FoldChange values onto the pathway diagram (presumably writes an image
# file named with out.suffix to the working directory -- confirm)
pathview(gene.data = geneList_controls, # named numeric vector: log2FoldChange keyed by TAIR-format GeneID
pathway.id = "ath00500", # KEGG pathway ID we want to visualize
species = "ath", # Arabidopsis thaliana KEGG organism code
out.suffix = "pathway") # suffix used for the output file name
# can use the following to verify if the actual pathway even exists
# browseKEGG(kegg_analysis_controls, 'ath00500')THE FOLLOWING IS NOT WORKING…or is it:
# GSEA needs one value per gene: drop later occurrences of any repeated GeneID,
# keeping only the first
dup_mask <- duplicated(names(geneList_controls))
geneList_controls_unique <- geneList_controls[!dup_mask]
# rank the gene list from highest to lowest log2FoldChange, as gseKEGG expects
geneList_controls_sorted <- sort(geneList_controls_unique, decreasing = TRUE)
# NOT WORKING!!!
# run KEGG pathway gene set enrichment analysis on the ranked list
kegg_gsea_controls <- gseKEGG(geneList = geneList_controls_sorted,
                              organism = 'ath',
                              pvalueCutoff = 0.05)
# 3/18/25
The following function, plot_gene_counts(), is a makeshift function that takes arguments and feeds them into the DESeq2 built-in function called plotCounts(). This allows us to visualize specific genes and their counts found in specific dds objects and contrasts. It takes a few arguments:
These parameters allow us to use the few dds objects made earlier in this script and still contrast specific samples accordingly.
The function can take a single gene or a list of genes and will simply facet wrap it so that we can avoid making a ton of unnecessary plots for the same contrast conditions and varying genes. I would only suggest to do 2 or 3 genes at a time.
In this particular script, the choice of re-leveled dds objects we have are:
They are also found in the title of each plot distinguishing the plots from one another (especially for dds_ko and dds_wt which only use numbers for their x-axis).
Use any of these and specify according to a particular gene, intgroup which is a colData column used in the original contrast (group vs. treatment vs. genotype), and specific samples for filter_groups (wt_0 vs. ko_0).
Compare the clustering of points to the log2FoldChange of your chosen gene(s). Remember that the log2FoldChange represents up or down regulation relative to the reference level which has been set as the wild-type control, wild-type variant, or knockout-line control where appropriate. Then, physically look at the log2FoldChange value and if, for example, it is negative, make sure its clustering is lower than the reference level clustering on the graph. Vice versa if positive. If both sets of clustering are close together, that is indicative of a fairly small padj value. We are just trying to make sure that the log2FoldChange is accurately demonstrating up or down regulation via the normalized counts.
Lastly, I’ve included a red dot that is representative of the mean value between the triplicate for each sample. You could remove that if you’d like.
Should I use normalized counts like plotCounts() normally does or should I use the raw counts as accomplished by changing: plot_data <- plotCounts(dds, gene = “AT1G02280.1”, intgroup = “group”, returnData = TRUE, normalized = FALSE)
plot_gene_counts <- function(dds, genes, intgroup, filter_groups) {
  # Visualize per-sample normalized counts for one or more genes of a
  # DESeqDataSet, faceted by gene and restricted to the requested groups.
  #
  # Args:
  #   dds: a DESeqDataSet (e.g. dds_controls, dds_treatment).
  #   genes: character vector of one or more gene IDs present in `dds`.
  #   intgroup: name of a colData column used in the original contrast
  #     (e.g. "group", "treatment", "genotype").
  #   filter_groups: the levels of `intgroup` to keep (e.g. c("ko_20", "wt_20")).
  #
  # Returns: a ggplot object -- jittered per-sample counts with a red point
  #   marking the mean of each group.

  # capture the name of the dds object so it can appear in the plot title
  dds_name <- deparse(substitute(dds))
  # extract plotCounts() data for each requested gene, tagging rows with the
  # gene ID so they can be faceted later
  plot_data_list <- lapply(genes, function(gene) {
    plot_data <- plotCounts(dds, gene = gene, intgroup = intgroup, returnData = TRUE)
    plot_data$gene <- gene # add gene identifier
    plot_data
  })
  # combine the per-gene data frames into a single data frame
  plot_data_combined <- bind_rows(plot_data_list)
  # keep only the requested sample groups
  plot_data_filtered <- plot_data_combined |>
    filter(.data[[intgroup]] %in% filter_groups)
  # generate plot title dynamically
  plot_title <- paste("Gene Counts for", paste(filter_groups, collapse = " vs. "),
                      "\nData from:", dds_name)
  # build the faceted plot; aes_string() is deprecated in ggplot2, so map the
  # string column name through the .data pronoun instead
  ggplot(plot_data_filtered, aes(x = .data[[intgroup]], y = count)) +
    geom_jitter(width = 0.2, size = 3, alpha = 0.6) +
    stat_summary(fun = "mean", geom = "point", size = 4, color = "red") + # red dot = group mean
    theme_minimal(base_size = 14) +
    labs(title = plot_title, y = "Normalized Counts", x = "Condition") +
    facet_wrap(~gene, scales = "free_y") +
    theme(
      # `size` for rect/line theme elements was renamed `linewidth` in ggplot2 3.4
      strip.background = element_rect(fill = "lightgray", color = "black", linewidth = 1.5), # highlight facet labels
      strip.text = element_text(face = "bold", size = 12), # make facet labels bold
      panel.border = element_rect(color = "black", fill = NA, linewidth = 1) # add borders to panels
    )
}
# example when using a single gene from dds_treatment (ko_20 vs. wt_20)
plot_gene_counts(dds_treatment, genes = "AT1G19350.4",
intgroup = "group", filter_groups = c("ko_20", "wt_20"))# example when using multiple genes from dds_treatment (ko_20 vs. wt_20); one facet per gene
plot_gene_counts(dds_treatment, genes = c("AT1G02280.1", "AT1G19350.4"),
intgroup = "group", filter_groups = c("ko_20", "wt_20"))Since we’ve already done 2 genes for ko_20 vs. wt_20 above, let’s continue and do at least 2 genes for each contrast we’ve done thus far.
The remaining KO Treatment vs. WT Treatment contrasts:
Only Controls:
# sanity check: pull the normalized counts that plot_gene_counts() visualizes for
# a specific gene and compare them against the plotted points (they have matched so far)
counts(dds_controls, normalized = TRUE)["AT1G02640.1", ] # row-subset for a single gene, all samples ## Col01_1_1 Col01_1_2 Col01_1_3 ST35_1_1 ST35_1_2 ST35_1_3
## 5674.450 5194.160 5408.201 2042.590 2755.125 3249.671
Only Wild-Type Samples:
Only Knockout-Line Samples: